Zeynep Erdem
Part 1
Part 2: Clusters for Recency, Frequency, Monetary value
Part 3: Clusters for RFM + Review (satisfaction)
Part 4: Clusters for RFM + Review (satisfaction) + Delivery duration, without PCA
Part 5: Clusters for RFM + Review (satisfaction) + Delivery duration, with PCA
Part 6: Conclusion and Recommendations
pwd
'/Users/zeyneperdem/Desktop/ds_prep/p5/livrables'
# Install the libraries
# !pip install pandas
# !pip install missingno
# !pip install matplotlib
# !pip install seaborn
# !pip install numpy
# !pip install black
# Import the libraries
import sys
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import missingno as msno
import sklearn
import scipy
import plotly
import warnings
import functions
from functions import *
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster.elbow import kelbow_visualizer
from plotly import __version__
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import cufflinks as cf
import plotly.io as pio
pio.renderers.default = "notebook"
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn import preprocessing
from scipy.cluster.hierarchy import dendrogram
init_notebook_mode(connected=True)
cf.go_offline()
%matplotlib inline
warnings.filterwarnings("ignore")
# Library versions
print("Python: ", sys.version)
print("Pandas: ", pd.__version__)
print("Numpy: ", np.__version__)
print("Seaborn: ", sns.__version__)
print("Matplotlib: ", matplotlib.__version__)
print("Missingno: ", msno.__version__)
print("Sklearn: ", sklearn.__version__)
print("Scipy: ", scipy.__version__)
print("Plotly: ", plotly.__version__)
Python:  3.9.16 (main, May 16 2023, 14:27:50) [Clang 14.0.6]
Pandas:  2.0.2
Numpy:  1.24.3
Seaborn:  0.12.2
Matplotlib:  3.7.1
Missingno:  0.5.2
Sklearn:  1.2.2
Scipy:  1.10.1
Plotly:  5.15.0
# Display options for dataframes
pd.set_option("display.max_columns", None)
# pd.set_option('display.max_colwidth', 1200)
# pd.set_option('display.max_rows', 200)
# black example.py
# black <file_or_directory_path>
path1 = "data/rfm_plus_geo.csv"
rfm_plus_date = pd.read_csv(path1)
rfm_plus_date.head(2)
| customer_unique_id | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | last_order_purchase_timestamp | customer_lat | customer_lng | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 2018-05-10 10:56:27 | -23.335331 | -46.828647 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 2018-05-07 11:11:27 | -23.567395 | -46.792957 |
rfm_plus = rfm_plus_date.drop(
["last_order_purchase_timestamp", "customer_lat", "customer_lng"], axis=1
)
rfm_plus.head(3)
| customer_unique_id | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | |
|---|---|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 |
| 2 | 0000f46a3911fa3c0805444483337064 | 585 | 1 | 86.22 | 3.0 | 25.0 |
info_general(rfm_plus)
*************************
* GENERAL INFORMATION ABOUT OUR DATAFRAME *
*************************
There are 96095 rows and 6 columns in our dataframe.
* Info on our dataframe *
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96095 entries, 0 to 96094
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   customer_unique_id     96095 non-null  object
 1   Recency                96095 non-null  int64
 2   Frequency              96095 non-null  int64
 3   MonetaryValue          96095 non-null  float64
 4   review_mean            96095 non-null  float64
 5   deliver_duration_mean  96095 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 4.4+ MB
None
* Statistical summary of our dataframe *
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| customer_unique_id | 96095 | 96095 | 0000366f3b9a7992bf8c76cfdf3221e2 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Recency | 96095.0 | NaN | NaN | NaN | 287.730756 | 153.407846 | 0.0 | 163.0 | 268.0 | 397.0 | 772.0 |
| Frequency | 96095.0 | NaN | NaN | NaN | 1.034809 | 0.214385 | 1.0 | 1.0 | 1.0 | 1.0 | 17.0 |
| MonetaryValue | 96095.0 | NaN | NaN | NaN | 166.594226 | 231.428912 | 0.0 | 63.12 | 108.0 | 183.53 | 13664.08 |
| review_mean | 96095.0 | NaN | NaN | NaN | 4.092044 | 1.338673 | 1.0 | 4.0 | 5.0 | 5.0 | 5.0 |
| deliver_duration_mean | 96095.0 | NaN | NaN | NaN | 12.041088 | 9.415758 | 0.0 | 6.0 | 10.0 | 15.0 | 209.0 |
* We check for duplicates in our dataframe *
** There are no duplicates **
** We check for null values and display the percentage of null values per column, in descending order **
** Our dataframe contains 0.0 % null values **
rfm_plus.set_index("customer_unique_id", inplace=True)
rfm_plus.head(2)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | |
|---|---|---|---|---|---|
| customer_unique_id | |||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 |
rfm = rfm_plus_date.drop(
[
"last_order_purchase_timestamp",
"customer_lat",
"customer_lng",
"review_mean",
"deliver_duration_mean",
],
axis=1,
)
rfm.set_index("customer_unique_id", inplace=True)
rfm.head(2)
| Recency | Frequency | MonetaryValue | |
|---|---|---|---|
| customer_unique_id | |||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 |
X = rfm
X_scaled = StandardScaler().fit_transform(X)
df_X_scaled = pd.DataFrame(X_scaled, columns=rfm.columns, index=rfm.index)
df_X_scaled.head(2)
| Recency | Frequency | MonetaryValue | |
|---|---|---|---|
| customer_unique_id | |||
| 0000366f3b9a7992bf8c76cfdf3221e2 | -0.832626 | -0.162369 | -0.106704 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | -0.813071 | -0.162369 | -0.602366 |
K-means is an unsupervised clustering algorithm used to partition a dataset into K clusters. It works by assigning each data point to the nearest cluster center, then recomputing the centers and reassigning the points until convergence. K-means aims to minimize intra-cluster variance and maximize inter-cluster separation. It is fast and efficient, but sensitive to initialization and can struggle with complex cluster shapes. It is widely used in fields such as data analysis and image segmentation.
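To make the assign/update loop concrete, here is a minimal NumPy sketch of a few Lloyd iterations on toy data. It is illustrative only; the segmentation below relies on sklearn's KMeans, and X_toy and k are made-up names for the example.

import numpy as np

rng = np.random.default_rng(3)
X_toy = rng.normal(size=(100, 2))  # toy data: 100 points in 2D
k = 3
centers = X_toy[rng.choice(len(X_toy), size=k, replace=False)]  # random initialization
for _ in range(10):  # a few Lloyd iterations
    # Assignment step: each point joins its nearest center
    dists = np.linalg.norm(X_toy[:, None, :] - centers[None, :, :], axis=2)
    labels = dists.argmin(axis=1)
    # Update step: each center moves to the mean of its assigned points
    new_centers = np.array(
        [X_toy[labels == j].mean(axis=0) if (labels == j).any() else centers[j] for j in range(k)]
    )
    if np.allclose(new_centers, centers):  # stop at convergence
        break
    centers = new_centers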
# Elbow method to choose the number of clusters
model = KMeans(init="k-means++", random_state=3)
visualizer = KElbowVisualizer(model, k=(2, 20), timings=False)
visualizer.fit(X_scaled)
visualizer.show();
# We choose 6 clusters
print(X_scaled.shape)
silhouettes = []
for num_clusters in range(2, 10):
    cls = KMeans(n_clusters=num_clusters, init="k-means++", random_state=3)
    cls.fit(X_scaled)
    silh = metrics.silhouette_score(X_scaled, cls.labels_)
    silhouettes.append(silh)
plt.plot(range(2, 10), silhouettes, marker="o");
(96095, 3)
Silhouette scores can also be used to choose the number of clusters, but the computation takes longer.
# Desired number of clusters
n_clust = 6
# K-means clustering
km = KMeans(n_clusters=n_clust, init="k-means++", random_state=3)
km.fit(X_scaled)
# Retrieve the cluster assigned to each individual
clusters_km = km.labels_
def plot_clusters(X_data, clusters):
# Create a scatter plot of the data points, colored by cluster assignment
fig = plt.figure(figsize=(8, 5))
scatter = plt.scatter(
X_data.iloc[:, 0], X_data.iloc[:, 1], c=clusters, cmap="jet", alpha=1
)
# Generate class names for the legend
unique_clusters = sorted(set(clusters))
classes = [f"Cls {cluster}" for cluster in unique_clusters]
# Set up legend
legend_handles = scatter.legend_elements()[0]
legend_labels = classes[: len(legend_handles)]
plt.legend(
handles=legend_handles,
labels=legend_labels,
loc="upper right",
fontsize="x-small",
)
    plt.title(
        "Projection of the {} individuals on the first two variables".format(X_data.shape[0])
    )
# Set x-axis label
plt.xlabel(X_data.columns[0])
# Set y-axis label
plt.ylabel(X_data.columns[1])
plt.show()
plot_clusters(rfm, clusters_km)
rfm_km = rfm.copy()
rfm_km["Clusters"] = clusters_km
rfm_km.head(2)
| Recency | Frequency | MonetaryValue | Clusters | |
|---|---|---|---|---|
| customer_unique_id | ||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 2 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 2 |
# Convert 'Clusters' to string
rfm_km["Clusters"] = rfm_km["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
rfm_km, x="Recency", y="Frequency", z="MonetaryValue", color=rfm_km["Clusters"]
)
fig.show()
# The 3D plot shows that KMeans separated the clusters well
rfm_km["Clusters"].value_counts()
Clusters
2    33881
0    33508
3    21152
5     4081
1     2968
4      505
Name: count, dtype: int64
rfm_km.groupby(by="Clusters").mean()
| Recency | Frequency | MonetaryValue | |
|---|---|---|---|
| Clusters | |||
| 0 | 304.550048 | 1.000000 | 121.785808 |
| 1 | 268.303908 | 2.115566 | 292.065735 |
| 2 | 135.775154 | 1.000000 | 126.159841 |
| 3 | 508.911167 | 1.000000 | 127.178945 |
| 4 | 286.689109 | 1.067327 | 2168.202950 |
| 5 | 279.057339 | 1.000000 | 735.547430 |
def calculate_cluster_statistics(data):
    """Return the per-cluster variable means plus the cluster sizes in thousands of clients."""
    # Cluster sizes, converted to thousands of clients
    cluster_value_counts = (
        data["Clusters"].value_counts().reset_index().sort_values("Clusters")
    )
    cluster_value_counts["count"] = cluster_value_counts["count"] / 1000
    # Mean of every variable within each cluster
    cluster_means = data.groupby(by="Clusters").mean().reset_index()
    df = pd.merge(cluster_means, cluster_value_counts, on="Clusters", how="inner")
    df.columns = list(cluster_means.columns) + ["Nb_of_clients(k)"]
    df = df.set_index("Clusters")
    return df
result_rfm_km = calculate_cluster_statistics(rfm_km)
result_rfm_km
| Recency | Frequency | MonetaryValue | Nb_of_clients(k) | |
|---|---|---|---|---|
| Clusters | ||||
| 0 | 304.550048 | 1.000000 | 121.785808 | 33.508 |
| 1 | 268.303908 | 2.115566 | 292.065735 | 2.968 |
| 2 | 135.775154 | 1.000000 | 126.159841 | 33.881 |
| 3 | 508.911167 | 1.000000 | 127.178945 | 21.152 |
| 4 | 286.689109 | 1.067327 | 2168.202950 | 0.505 |
| 5 | 279.057339 | 1.000000 | 735.547430 | 4.081 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters Kmeans RFM", size=25)
sns.heatmap(result_rfm_km, annot=True, cmap="Reds", fmt=".3f");
def plot_cluster_boxplots(data):
"""
Plot boxplots for each variable against the cluster labels.
Args:
data (pandas.DataFrame): The DataFrame containing the data and cluster assignments.
The DataFrame should have a column named 'Clusters' containing the cluster labels,
and other columns representing the variables to be plotted.
"""
sns.set(font_scale=1.2)
cluster_column = "Clusters"
variable_columns = data.drop(cluster_column, axis=1).columns
data = data.sort_values(cluster_column) # Sort data by the cluster column
n_clusters = len(
data[cluster_column].unique()
) # Get the number of unique cluster labels
fig, axes = plt.subplots(1, len(variable_columns), figsize=(20, 6))
fig.suptitle("Clusters vs other variables")
for i, variable in enumerate(variable_columns):
sns.boxplot(ax=axes[i], data=data, x=cluster_column, y=variable)
axes[i].set_xticklabels(
range(n_clusters)
) # Use range(n_clusters) as tick labels
plt.show()
plot_cluster_boxplots(rfm_km)
# The heatmap and the boxplots show the same results
The silhouette score is an evaluation measure commonly used to quantify cluster quality in clustering analysis. It estimates intra-cluster cohesion and inter-cluster separation.
Computing the overall silhouette score: the overall silhouette score is the mean of the silhouette scores of all data points. It gives a global measure of cluster quality ranging from -1 to 1. A score close to 1 indicates well-separated, cohesive clusters; a score close to 0 indicates overlapping or ambiguous clusters; and a score close to -1 indicates misclassified points.
Note, however, that the silhouette score is not always reliable, in particular when the data have complex geometric shapes or when the clusters have unequal sizes.
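For reference, the per-point silhouette that gets averaged is, with $a(i)$ the mean distance from point $i$ to the members of its own cluster and $b(i)$ the mean distance to the members of the nearest other cluster:

$$s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}$$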
Computing the Davies-Bouldin score: the Davies-Bouldin score is the mean, over all clusters, of the ratio between inter-cluster similarity and intra-cluster cohesion. A lower score indicates better separation between clusters and better intra-cluster cohesion, i.e. better cluster quality.
It can, however, be sensitive to the dimensionality of the data and to cluster sizes.
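For reference, with $\sigma_i$ the average distance of cluster $i$'s points to their centroid $c_i$ and $d(c_i, c_j)$ the distance between centroids, the Davies-Bouldin index over $k$ clusters is:

$$DB = \frac{1}{k} \sum_{i=1}^{k} \max_{j \neq i} \frac{\sigma_i + \sigma_j}{d(c_i, c_j)}$$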
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled, km.labels_).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled, km.labels_).round(2)
res_kmeans_rfm = {
"Silhouette Score": silhouette,
"Davies-Bouldin Score": davies_bouldin,
}
index = ["KMeans_rfm"]
res_kmeans_rfm = pd.DataFrame(res_kmeans_rfm, index=index)
res_kmeans_rfm
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| KMeans_rfm | 0.44 | 0.71 |
plt.figure(figsize=(10, 5))
visualizer = SilhouetteVisualizer(km, colors="yellowbrick")
visualizer.fit(X_scaled)
visualizer.show();
The clusters have silhouette scores above the average.
They are fairly balanced, except for the clusters that contain extreme values.
CAH (agglomerative hierarchical clustering) is a hierarchical clustering algorithm that partitions a dataset into clusters progressively. It merges similar clusters until a single global cluster remains. It can reveal complex cluster structures, but is expensive in computation time. The number of clusters is chosen from the merge hierarchy.
We first treat each client as its own cluster, so there are as many clusters as clients. We then find the two closest clusters and merge them into one. This step is repeated until all clients are grouped into a single large cluster (see the fcluster sketch right below).
Since the computation time of agglomerative hierarchical clustering (CAH) is very long, we drew a sample of 10,000 individuals.
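As a side note (a reference sketch, not part of the original analysis): instead of cutting the dendrogram at a hand-picked distance, as done below with fcluster(..., criterion="distance"), scipy can return a fixed number of clusters directly. A minimal sketch on made-up toy data:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

X_toy = np.random.default_rng(3).normal(size=(50, 3))  # toy data
Z_toy = linkage(X_toy, method="ward")  # Ward linkage, as in this notebook
labels_toy = fcluster(Z_toy, t=6, criterion="maxclust")  # ask for exactly 6 clusters
print(np.unique(labels_toy))  # -> [1 2 3 4 5 6]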
# rfm_plus_exp = rfm_plus_date.sample(10000)
# rfm_plus_exp.to_csv("rfm_plus_exp.csv", index=False)
rfm_plus_exp = pd.read_csv("rfm_plus_exp.csv")
rfm_plus_exp.head(2)
| customer_unique_id | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | last_order_purchase_timestamp | customer_lat | customer_lng | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | b55fb2799bf92225fc54fdb4cb7b82e2 | 318 | 1 | 33.72 | 5.0 | 5.0 | 2017-12-03 02:24:54 | -23.662035 | -46.432956 |
| 1 | 91ed74cc8fc470081c5b71fe5016c89f | 199 | 1 | 783.28 | 5.0 | 9.0 | 2018-04-01 12:30:49 | -21.766477 | -48.831547 |
rfm_exp = rfm_plus_exp.drop(
[
"last_order_purchase_timestamp",
"customer_lat",
"customer_lng",
"review_mean",
"deliver_duration_mean",
],
axis=1,
)
rfm_exp.set_index("customer_unique_id", inplace=True)
rfm_exp.head(2)
| Recency | Frequency | MonetaryValue | |
|---|---|---|---|
| customer_unique_id | |||
| b55fb2799bf92225fc54fdb4cb7b82e2 | 318 | 1 | 33.72 |
| 91ed74cc8fc470081c5b71fe5016c89f | 199 | 1 | 783.28 |
X_scaled_rfm_exp = StandardScaler().fit_transform(rfm_exp)
df_X_scaled_rfm_exp = pd.DataFrame(
X_scaled_rfm_exp, columns=rfm_exp.columns, index=rfm_exp.index
)
df_X_scaled_rfm_exp.head(2)
| Recency | Frequency | MonetaryValue | |
|---|---|---|---|
| customer_unique_id | |||
| b55fb2799bf92225fc54fdb4cb7b82e2 | 0.211420 | -0.153988 | -0.597826 |
| 91ed74cc8fc470081c5b71fe5016c89f | -0.564669 | -0.153988 | 2.805209 |
# Plot settings for the dendrogram
plt.rcParams["axes.grid"] = False
# Prepare the labels for the clustering
names = rfm_exp.index
# Hierarchical clustering
Z = linkage(X_scaled_rfm_exp, "ward")
# Display the dendrogram
plt.figure(figsize=(25, 8))
plt.title("Hierarchical Clustering Dendrogram", size=35)
plt.ylabel("distance")
dendrogram(Z, labels=names, leaf_font_size=10, orientation="top")
plt.axhline(y=16.4, color="r", linestyle="--")
plt.show()
plt.rcParams["axes.grid"] = False
plt.figure(figsize=(10, 5))
plt.title("Hierarchical Clustering Dendrogram", size=20)
plt.ylabel("distance")
dendrogram(
Z,
labels=names,
p=6,
truncate_mode="lastp",
leaf_font_size=10,
orientation="top",
show_contracted=True,
)
plt.axhline(y=16.4, color="r", linestyle="--")
plt.show()
# We can see the number of clients in each cluster
# We get the cluster labels
labels_cah = fcluster(Z, 49, criterion="distance")
rfm_cah = rfm_exp.copy()
rfm_cah["Clusters"] = labels_cah
calculate_cluster_statistics(rfm_cah)
| Recency | Frequency | MonetaryValue | Nb_of_clients(k) | |
|---|---|---|---|---|
| Clusters | ||||
| 1 | 274.848837 | 2.000000 | 249.007403 | 0.258 |
| 2 | 230.625000 | 3.583333 | 445.756250 | 0.024 |
| 3 | 474.621781 | 1.000000 | 108.607645 | 2.718 |
| 4 | 252.230088 | 1.048673 | 1249.301903 | 0.226 |
| 5 | 138.910567 | 1.000000 | 112.554499 | 3.634 |
| 6 | 295.398408 | 1.000000 | 188.688255 | 3.140 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters avec CAH", size=25)
sns.heatmap(calculate_cluster_statistics(rfm_cah), annot=True, cmap="Reds", fmt=".3f");
# Some clusters have the same profile as those obtained with KMeans, which is reassuring.
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled_rfm_exp, labels_cah).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled_rfm_exp, labels_cah).round(2)
res_cah_rfm = {"Silhouette Score": silhouette, "Davies-Bouldin Score": davies_bouldin}
index = ["CAH_rfm"]
res_cah_rfm = pd.DataFrame(res_cah_rfm, index=index)
res_cah_rfm
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| CAH_rfm | 0.39 | 0.87 |
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is an unsupervised clustering algorithm that detects clusters of arbitrary shape based on point density. It works by defining a radius around each point and searching for neighboring points within that range. Core points are then used to grow the clusters, while isolated points are treated as noise. DBSCAN is effective for data exploration, detects outliers, and can handle large amounts of data.
Parameter definition: DBSCAN mainly requires two parameters, the radius (eps) and the minimum number of samples (min_samples). The radius determines the neighborhood around each point, while min_samples specifies the minimum number of points inside that neighborhood for a point to be considered a core point. A common heuristic for choosing eps is sketched right below.
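A common heuristic for picking eps (a reference sketch, not part of the original analysis) is the k-distance plot: sort every point's distance to its min_samples-th nearest neighbor and look for the elbow. A minimal sketch, assuming the scaled matrix X_scaled defined above:

from sklearn.neighbors import NearestNeighbors

min_samples = 25
nn = NearestNeighbors(n_neighbors=min_samples).fit(X_scaled)
dists, _ = nn.kneighbors(X_scaled)  # distances to the min_samples nearest neighbors
k_dists = np.sort(dists[:, -1])  # distance to the farthest of them, sorted
plt.figure(figsize=(8, 4))
plt.plot(k_dists)
plt.xlabel("Points sorted by k-distance")
plt.ylabel("Distance to the 25th neighbor")
plt.title("k-distance plot for choosing eps");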
def explore_dbscan_params(X_data, eps_values, min_samples_values):
for eps in eps_values:
for min_samples in min_samples_values:
# Perform DBSCAN clustering
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X_data)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
            # Calculate the silhouette score (defined only when there are at least 2 clusters)
            if n_clusters > 1:
                silhouette = silhouette_score(X_data, labels)
            else:
                silhouette = np.nan
# Print results
print(f"Parameters: eps={eps}, min_samples={min_samples}")
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print(f"Silhouette score: {silhouette}")
print()
eps_values = np.arange(0.1, 1, 0.2)
min_samples_values = np.arange(25, 30, 5)
explore_dbscan_params(X_scaled, eps_values, min_samples_values)
Parameters: eps=0.1, min_samples=25
Number of clusters: 14
Number of noise points: 4964
Silhouette score: 0.23048637601815816

Parameters: eps=0.30000000000000004, min_samples=25
Number of clusters: 4
Number of noise points: 1029
Silhouette score: 0.45869270803863915

Parameters: eps=0.5000000000000001, min_samples=25
Number of clusters: 3
Number of noise points: 494
Silhouette score: 0.7044507226811484

Parameters: eps=0.7000000000000001, min_samples=25
Number of clusters: 3
Number of noise points: 266
Silhouette score: 0.7050015531139879

Parameters: eps=0.9000000000000001, min_samples=25
Number of clusters: 3
Number of noise points: 180
Silhouette score: 0.7062545596298613
db = DBSCAN(eps=0.5, min_samples=25).fit(X_scaled)
db_labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)
n_noise_ = list(db_labels).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 3 Estimated number of noise points: 494
rfm_db = rfm.copy()
rfm_db["Clusters"] = db.labels_
rfm_db.head(3)
| Recency | Frequency | MonetaryValue | Clusters | |
|---|---|---|---|---|
| customer_unique_id | ||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 0 |
| 0000f46a3911fa3c0805444483337064 | 585 | 1 | 86.22 | 0 |
rfm_db["Clusters"].value_counts()
Clusters
 0    92873
 1     2636
-1      494
 2       92
Name: count, dtype: int64
plot_clusters(rfm_db, db_labels)
# Convert 'Clusters' to string
rfm_db["Clusters"] = rfm_db["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
rfm_db, x="Recency", y="Frequency", z="MonetaryValue", color=rfm_db["Clusters"]
)
fig.show()
calculate_cluster_statistics(rfm_db)
| Recency | Frequency | MonetaryValue | Nb_of_clients(k) | |
|---|---|---|---|---|
| Clusters | ||||
| -1 | 349.582996 | 2.062753 | 1636.784190 | 0.494 |
| 0 | 288.056723 | 1.000000 | 156.250280 | 92.873 |
| 1 | 268.600910 | 2.000000 | 252.097079 | 2.636 |
| 2 | 174.663043 | 3.000000 | 264.571522 | 0.092 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters DBScan RFM", size=25)
sns.heatmap(calculate_cluster_statistics(rfm_db), annot=True, cmap="Reds", fmt=".3f");
# Cluster -1 contains the outliers according to DBSCAN.
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled, db_labels).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled, db_labels).round(2)
res_dbscan_rfm = {
"Silhouette Score": silhouette,
"Davies-Bouldin Score": davies_bouldin,
}
index = ["DBScan_rfm"]
res_dbscan_rfm = pd.DataFrame(res_dbscan_rfm, index=index)
res_dbscan_rfm
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| DBScan_rfm | 0.7 | 1.16 |
sns.set(font_scale=1)
# db was already fitted above; we reuse its labels
# Compute the silhouette scores
silhouette_scores = silhouette_samples(X_scaled, db.labels_)
# Plot the silhouette scores
plt.figure(figsize=(10, 5))
y_lower = 10
for i in range(len(set(db.labels_)) - 1):
cluster_scores = silhouette_scores[db.labels_ == i]
cluster_scores.sort()
size_cluster_i = cluster_scores.shape[0]
y_upper = y_lower + size_cluster_i
color = plt.cm.Spectral(i / len(set(db.labels_)))
plt.fill_betweenx(
np.arange(y_lower, y_upper),
0,
cluster_scores,
facecolor=color,
edgecolor=color,
alpha=0.7,
)
plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10
plt.xlabel("Silhouette Coefficient")
plt.ylabel("Cluster")
plt.title("Silhouette Analysis for DBSCAN Clustering")
plt.show()
Despite the high silhouette coefficient, DBSCAN splits the clients into 3 clusters that are not balanced.
# We try a second clustering pass on cluster 0
cluster0 = rfm_db[rfm_db.Clusters == "0"]
cluster0.head(2)
| Recency | Frequency | MonetaryValue | Clusters | |
|---|---|---|---|---|
| customer_unique_id | ||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 0 |
X_0 = cluster0.drop("Clusters", axis=1).values
X_0_scaled = StandardScaler().fit_transform(X_0)
db_0 = DBSCAN(eps=0.5, min_samples=25).fit(X_0_scaled)
labels_0 = db_0.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_0)) - (1 if -1 in labels_0 else 0)
n_noise_ = list(labels_0).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 2 Estimated number of noise points: 36
rfm_db_0 = cluster0.drop("Clusters", axis=1).copy()
rfm_db_0["Clusters"] = labels_0
calculate_cluster_statistics(rfm_db_0)
| Recency | Frequency | MonetaryValue | Nb_of_clients(k) | |
|---|---|---|---|---|
| Clusters | ||||
| -1 | 222.611111 | 1.0 | 1905.973056 | 0.036 |
| 0 | 288.202684 | 1.0 | 154.037254 | 92.760 |
| 1 | 142.818182 | 1.0 | 2004.176364 | 0.077 |
res_rfm = pd.concat([res_kmeans_rfm, res_cah_rfm, res_dbscan_rfm], axis=0)
res_rfm
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| KMeans_rfm | 0.44 | 0.71 |
| CAH_rfm | 0.39 | 0.87 |
| DBScan_rfm | 0.70 | 1.16 |
rfm_plus.columns
Index(['Recency', 'Frequency', 'MonetaryValue', 'review_mean',
'deliver_duration_mean'],
dtype='object')
rfm_review = rfm_plus.drop(
["deliver_duration_mean"],
axis=1,
)
rfm_review.head(3)
| Recency | Frequency | MonetaryValue | review_mean | |
|---|---|---|---|---|
| customer_unique_id | ||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 |
| 0000f46a3911fa3c0805444483337064 | 585 | 1 | 86.22 | 3.0 |
X_review = rfm_review
X_scaled_review = StandardScaler().fit_transform(X_review)
df_X_scaled_review = pd.DataFrame(
X_scaled_review, columns=rfm_review.columns, index=rfm_review.index
)
# Elbow method to choose the number of clusters
model = KMeans(init="k-means++", random_state=3)
visualizer = KElbowVisualizer(model, k=(2, 20), timings=False)
visualizer.fit(X_scaled_review)
visualizer.show();
# The elbow method lets us pick the number of clusters
# Based on the plot, and to keep the same number of clusters as the other dataframes, we use 6 clusters
# Desired number of clusters
n_clust = 6
# K-means clustering
km_review_first = KMeans(n_clusters=n_clust, init="k-means++", random_state=3)
km_review_first.fit(X_scaled_review)
# Retrieve the cluster assigned to each individual
clusters_km_review_first = km_review_first.labels_
plot_clusters(df_X_scaled_review, clusters_km_review_first)
rfm_km_review_first = rfm_review.copy()
rfm_km_review_first["Clusters"] = clusters_km_review_first
rfm_km_review_first.head(2)
| Recency | Frequency | MonetaryValue | review_mean | Clusters | |
|---|---|---|---|---|---|
| customer_unique_id | |||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 1 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 1 |
calculate_cluster_statistics(rfm_km_review_first)
| Recency | Frequency | MonetaryValue | review_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|
| Clusters | |||||
| 0 | 287.309013 | 1.070815 | 2225.603863 | 3.638770 | 0.466 |
| 1 | 170.764903 | 1.000000 | 123.500653 | 4.671896 | 41.400 |
| 2 | 268.254294 | 2.115527 | 292.476079 | 4.127422 | 2.969 |
| 3 | 442.203270 | 1.000000 | 124.294562 | 4.635832 | 31.318 |
| 4 | 285.431785 | 1.000000 | 784.621780 | 4.189501 | 3.467 |
| 5 | 292.016571 | 1.000000 | 144.310254 | 1.587162 | 16.475 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters KMeans RFM + Review", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_km_review_first),
annot=True,
cmap="Reds",
fmt=".3f",
);
# Convert 'Clusters' to string
rfm_km_review_first["Clusters"] = rfm_km_review_first["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
rfm_km_review_first,
x="Recency",
y="Frequency",
z="MonetaryValue",
color=rfm_km_review_first["Clusters"],
)
fig.show()
plt.figure(figsize=(10, 5))
visualizer = SilhouetteVisualizer(km_review_first, colors="yellowbrick")
visualizer.fit(X_scaled_review)
visualizer.show();
The clusters have silhouette scores above the average.
They are fairly balanced, except for the clusters that contain extreme values.
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled_review, km_review_first.labels_).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled_review, km_review_first.labels_).round(2)
res_kmeans_review = {
"Silhouette Score": silhouette,
"Davies-Bouldin Score": davies_bouldin,
}
index = ["KMeans_review"]
res_kmeans_review = pd.DataFrame(res_kmeans_review, index=index)
res_kmeans_review
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| KMeans_review | 0.43 | 0.78 |
rfm_plus_date.head(2)
| customer_unique_id | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | last_order_purchase_timestamp | customer_lat | customer_lng | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 2018-05-10 10:56:27 | -23.335331 | -46.828647 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 2018-05-07 11:11:27 | -23.567395 | -46.792957 |
map_review = rfm_km_review_first.copy()
map_review["latitude"] = rfm_plus_date.set_index("customer_unique_id").customer_lat
map_review["longitude"] = rfm_plus_date.set_index("customer_unique_id").customer_lng
map_review.head(2)
| Recency | Frequency | MonetaryValue | review_mean | Clusters | latitude | longitude | |
|---|---|---|---|---|---|---|---|
| customer_unique_id | |||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 1 | -23.335331 | -46.828647 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 1 | -23.567395 | -46.792957 |
map_review = map_review.dropna().copy()
def plot_clusters_on_map(data, cluster_column, lat_column, lon_column):
# Filter data for each cluster
clusters = data[cluster_column].unique()
filtered_data = []
for cluster in clusters:
filtered_data.append(data[data[cluster_column] == cluster])
# Create a scatter plot for each cluster using Plotly
fig = go.Figure()
for i, cluster_data in enumerate(filtered_data):
fig.add_trace(
go.Scattermapbox(
lat=cluster_data[lat_column],
lon=cluster_data[lon_column],
mode="markers",
                marker=dict(color=px.colors.qualitative.Plotly[i % 10]),  # one distinct color per cluster
name=f"Cluster {clusters[i]}",
)
)
# Set the initial scope and center the map
fig.update_layout(
mapbox=dict(
style="open-street-map",
center=dict(lat=data[lat_column].mean(), lon=data[lon_column].mean()),
zoom=10,
),
showlegend=True,
height=600,
)
# Display the interactive plot
fig.show()
plot_clusters_on_map(map_review, "Clusters", "latitude", "longitude")
# Most of the clients are located on the coast
rfm_exp_review = rfm_plus_exp.drop(
[
"last_order_purchase_timestamp",
"customer_lat",
"customer_lng",
"deliver_duration_mean",
],
axis=1,
)
rfm_exp_review.set_index("customer_unique_id", inplace=True)
rfm_exp_review.head(2)
| Recency | Frequency | MonetaryValue | review_mean | |
|---|---|---|---|---|
| customer_unique_id | ||||
| b55fb2799bf92225fc54fdb4cb7b82e2 | 318 | 1 | 33.72 | 5.0 |
| 91ed74cc8fc470081c5b71fe5016c89f | 199 | 1 | 783.28 | 5.0 |
X_scaled_rfm_exp_review = StandardScaler().fit_transform(rfm_exp_review)
df_X_scaled_rfm_exp_review = pd.DataFrame(
X_scaled_rfm_exp_review, columns=rfm_exp_review.columns, index=rfm_exp_review.index
)
df_X_scaled_rfm_exp_review.head(2)
| Recency | Frequency | MonetaryValue | review_mean | |
|---|---|---|---|---|
| customer_unique_id | ||||
| b55fb2799bf92225fc54fdb4cb7b82e2 | 0.211420 | -0.153988 | -0.597826 | 0.677115 |
| 91ed74cc8fc470081c5b71fe5016c89f | -0.564669 | -0.153988 | 2.805209 | 0.677115 |
# Plot settings for the dendrogram
plt.rcParams["axes.grid"] = False
# Prepare the labels for the clustering
names = rfm_exp_review.index
# Hierarchical clustering
Z = linkage(X_scaled_rfm_exp_review, "ward")
# Display the dendrogram
plt.figure(figsize=(25, 8))
plt.title("Hierarchical Clustering Dendrogram", size=35)
plt.ylabel("distance")
dendrogram(Z, labels=names, leaf_font_size=10, orientation="top")
plt.axhline(y=16.4, color="r", linestyle="--")
plt.show()
plt.rcParams["axes.grid"] = False
plt.figure(figsize=(10, 5))
plt.title("Hierarchical Clustering Dendrogram", size=20)
plt.ylabel("distance")
dendrogram(
Z,
labels=names,
p=6,
truncate_mode="lastp",
leaf_font_size=10,
orientation="top",
show_contracted=True,
)
plt.axhline(y=16.4, color="r", linestyle="--")
plt.show()
# We can see the number of clients in each cluster
# We get the cluster labels
labels_cah_review = fcluster(Z, 55, criterion="distance")
rfm_cah_review = rfm_exp_review.copy()
rfm_cah_review["Clusters"] = labels_cah_review
calculate_cluster_statistics(rfm_cah_review)
| Recency | Frequency | MonetaryValue | review_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|
| Clusters | |||||
| 1 | 272.556701 | 2.137457 | 310.761615 | 4.166189 | 0.291 |
| 2 | 240.105263 | 1.000000 | 1473.080451 | 3.466165 | 0.133 |
| 3 | 315.479023 | 1.000000 | 145.973016 | 1.730749 | 1.883 |
| 4 | 486.596741 | 1.000000 | 112.558534 | 4.768330 | 1.964 |
| 5 | 275.877370 | 1.000000 | 451.102592 | 4.654867 | 0.791 |
| 6 | 197.779263 | 1.000000 | 104.268613 | 4.638113 | 4.938 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters avec CAH", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_cah_review), annot=True, cmap="Reds", fmt=".3f"
);
# Some clusters have the same profile as those obtained with KMeans, which is reassuring.
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled_rfm_exp_review, labels_cah_review).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled_rfm_exp_review, labels_cah_review).round(
2
)
res_cah_review = {
"Silhouette Score": silhouette,
"Davies-Bouldin Score": davies_bouldin,
}
index = ["CAH_review"]
res_cah_review = pd.DataFrame(res_cah_review, index=index)
res_cah_review
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| CAH_review | 0.37 | 0.94 |
eps_values = np.arange(0.1, 1, 0.2)
min_samples_values = np.arange(25, 30, 5)
explore_dbscan_params(X_scaled_review, eps_values, min_samples_values)
Parameters: eps=0.1, min_samples=25
Number of clusters: 14
Number of noise points: 10864
Silhouette score: 0.012624936859960741

Parameters: eps=0.30000000000000004, min_samples=25
Number of clusters: 10
Number of noise points: 3302
Silhouette score: 0.09593509628425957

Parameters: eps=0.5000000000000001, min_samples=25
Number of clusters: 7
Number of noise points: 1364
Silhouette score: 0.29819828414699334

Parameters: eps=0.7000000000000001, min_samples=25
Number of clusters: 3
Number of noise points: 774
Silhouette score: 0.613780979896961

Parameters: eps=0.9000000000000001, min_samples=25
Number of clusters: 3
Number of noise points: 348
Silhouette score: 0.6147734364522617
db_review = DBSCAN(eps=0.7, min_samples=25).fit(X_scaled_review)
db_labels_review = db_review.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(db_labels_review)) - (1 if -1 in db_labels_review else 0)
n_noise_ = list(db_labels_review).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 3 Estimated number of noise points: 774
rfm_db_review = rfm_review.copy()
rfm_db_review["Clusters"] = db_review.labels_
rfm_db_review.head(3)
| Recency | Frequency | MonetaryValue | review_mean | Clusters | |
|---|---|---|---|---|---|
| customer_unique_id | |||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 0 |
| 0000f46a3911fa3c0805444483337064 | 585 | 1 | 86.22 | 3.0 | 0 |
rfm_db_review["Clusters"].value_counts()
Clusters
 0    92651
 1     2593
-1      774
 2       77
Name: count, dtype: int64
plot_clusters(rfm_db_review, db_labels_review)
# Convert 'Clusters' to string
rfm_db_review["Clusters"] = rfm_db_review["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
rfm_db_review,
x="Recency",
y="Frequency",
z="MonetaryValue",
color=rfm_db_review["Clusters"],
)
fig.show()
calculate_cluster_statistics(rfm_db_review)
| Recency | Frequency | MonetaryValue | review_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|
| Clusters | |||||
| -1 | 319.244186 | 1.77261 | 1509.451434 | 3.178773 | 0.774 |
| 0 | 288.123021 | 1.00000 | 153.079826 | 4.097301 | 92.651 |
| 1 | 267.522175 | 2.00000 | 246.276236 | 4.159468 | 2.593 |
| 2 | 179.493506 | 3.00000 | 246.278701 | 4.676768 | 0.077 |
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled_review, db_labels_review).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled_review, db_labels_review).round(2)
res_dbscan_review = {
"Silhouette Score": silhouette,
"Davies-Bouldin Score": davies_bouldin,
}
index = ["DBScan_review"]
res_dbscan_review = pd.DataFrame(res_dbscan_review, index=index)
res_dbscan_review
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| DBScan_review | 0.61 | 1.23 |
sns.set(font_scale=1)
# db_review was already fitted above; we reuse its labels
# Compute the silhouette scores
silhouette_scores = silhouette_samples(X_scaled_review, db_review.labels_)
# Plot the silhouette scores
plt.figure(figsize=(10, 5))
y_lower = 10
for i in range(len(set(db_review.labels_)) - 1):
cluster_scores = silhouette_scores[db_review.labels_ == i]
cluster_scores.sort()
size_cluster_i = cluster_scores.shape[0]
y_upper = y_lower + size_cluster_i
color = plt.cm.Spectral(i / len(set(db_review.labels_)))
plt.fill_betweenx(
np.arange(y_lower, y_upper),
0,
cluster_scores,
facecolor=color,
edgecolor=color,
alpha=0.7,
)
plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10
plt.xlabel("Silhouette Coefficient")
plt.ylabel("Cluster")
plt.title("Silhouette Analysis for DBSCAN Clustering")
plt.show()
Despite the high silhouette coefficient, DBSCAN splits the clients into 3 clusters that are not balanced.
cluster0_review = rfm_db_review[rfm_db_review.Clusters == "0"]
cluster0_review.head(2)
| Recency | Frequency | MonetaryValue | review_mean | Clusters | |
|---|---|---|---|---|---|
| customer_unique_id | |||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 0 |
X_0_review = cluster0_review.drop("Clusters", axis=1).values
X_0_scaled_review = StandardScaler().fit_transform(X_0_review)
db_0_review = DBSCAN(eps=0.5, min_samples=25).fit(X_0_scaled_review)
labels_0_review = db_0_review.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_0_review)) - (1 if -1 in labels_0_review else 0)
n_noise_ = list(labels_0_review).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 5 Estimated number of noise points: 603
rfm_db_0_review = cluster0_review.drop("Clusters", axis=1).copy()
rfm_db_0_review["Clusters"] = labels_0_review
calculate_cluster_statistics(rfm_db_0_review)
| Recency | Frequency | MonetaryValue | review_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|
| Clusters | |||||
| -1 | 372.114428 | 1.0 | 1058.591625 | 3.300166 | 0.603 |
| 0 | 286.452832 | 1.0 | 146.973414 | 4.103738 | 91.789 |
| 1 | 739.945946 | 1.0 | 82.993784 | 4.000000 | 0.037 |
| 2 | 741.600000 | 1.0 | 114.126857 | 1.000000 | 0.070 |
| 3 | 740.032000 | 1.0 | 118.212240 | 5.000000 | 0.125 |
| 4 | 203.259259 | 1.0 | 1047.754815 | 4.000000 | 0.027 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap DBScan pour le Cluster 0", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_db_0_review), annot=True, cmap="Reds", fmt=".3f"
);
# Cluster -1 contains the outliers according to DBSCAN.
res_review = pd.concat([res_kmeans_review, res_cah_review, res_dbscan_review], axis=0)
res_review
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| KMeans_review | 0.43 | 0.78 |
| CAH_review | 0.37 | 0.94 |
| DBScan_review | 0.61 | 1.23 |
X_plus = rfm_plus
X_scaled_plus = StandardScaler().fit_transform(X_plus)
df_X_scaled_plus = pd.DataFrame(
X_scaled_plus, columns=rfm_plus.columns, index=rfm_plus.index
)
# Elbow method to choose the number of clusters
model = KMeans(init="k-means++", random_state=3)
visualizer = KElbowVisualizer(model, k=(2, 20), timings=False)
visualizer.fit(X_scaled_plus)
visualizer.show();
# The elbow method lets us pick the number of clusters
# Desired number of clusters
n_clust = 6
# K-means clustering
km_plus_first = KMeans(n_clusters=n_clust, init="k-means++", random_state=3)
km_plus_first.fit(X_scaled_plus)
# Retrieve the cluster assigned to each individual
clusters_km_plus_first = km_plus_first.labels_
plot_clusters(df_X_scaled_plus, clusters_km_plus_first)
rfm_km_plus_first = rfm_plus.copy()
rfm_km_plus_first["Clusters"] = clusters_km_plus_first
rfm_km_plus_first.head(2)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Clusters | |
|---|---|---|---|---|---|---|
| customer_unique_id | ||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 1 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 1 |
calculate_cluster_statistics(rfm_km_plus_first)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|---|
| Clusters | ||||||
| 0 | 441.155385 | 1.000000 | 135.379627 | 4.649164 | 11.035269 | 31.586 |
| 1 | 169.368211 | 1.000000 | 132.627422 | 4.690642 | 9.399893 | 41.001 |
| 2 | 286.387770 | 1.000360 | 169.917514 | 2.216007 | 37.793345 | 5.560 |
| 3 | 268.332095 | 2.116554 | 290.381693 | 4.128372 | 11.764547 | 2.960 |
| 4 | 293.527299 | 1.000000 | 150.146482 | 1.665977 | 11.740639 | 13.059 |
| 5 | 285.710213 | 1.019699 | 1311.498294 | 4.022032 | 12.881458 | 1.929 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters KMeans", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_km_plus_first), annot=True, cmap="Reds", fmt=".3f"
);
# Convert 'Clusters' to string
rfm_km_plus_first["Clusters"] = rfm_km_plus_first["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
rfm_km_plus_first,
x="Recency",
y="Frequency",
z="MonetaryValue",
color=rfm_km_plus_first["Clusters"],
)
fig.show()
plt.figure(figsize=(10, 5))
visualizer = SilhouetteVisualizer(km_plus_first, colors="yellowbrick")
visualizer.fit(X_scaled_plus)
visualizer.show();
The clusters have silhouette scores above the average.
They are fairly balanced, except for the clusters that contain extreme values.
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled_plus, km_plus_first.labels_).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled_plus, km_plus_first.labels_).round(2)
res_kmeans_plus = {
"Silhouette Score": silhouette,
"Davies-Bouldin Score": davies_bouldin,
}
index = ["Kmeans_plus"]
res_kmeans_plus = pd.DataFrame(res_kmeans_plus, index=index)
res_kmeans_plus
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| Kmeans_plus | 0.34 | 0.98 |
map_plus = rfm_km_plus_first.copy()
map_plus["latitude"] = rfm_plus_date.set_index("customer_unique_id").customer_lat
map_plus["longitude"] = rfm_plus_date.set_index("customer_unique_id").customer_lng
map_plus.head(2)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Clusters | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|
| customer_unique_id | ||||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 1 | -23.335331 | -46.828647 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 1 | -23.567395 | -46.792957 |
map_plus = map_plus.dropna().copy()
plot_clusters_on_map(map_plus, "Clusters", "latitude", "longitude")
# Most of the clients are located on the coast
rfm_exp_plus = rfm_plus_exp.drop(
["last_order_purchase_timestamp", "customer_lat", "customer_lng"], axis=1
)
rfm_exp_plus.set_index("customer_unique_id", inplace=True)
rfm_exp_plus.head(2)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | |
|---|---|---|---|---|---|
| customer_unique_id | |||||
| b55fb2799bf92225fc54fdb4cb7b82e2 | 318 | 1 | 33.72 | 5.0 | 5.0 |
| 91ed74cc8fc470081c5b71fe5016c89f | 199 | 1 | 783.28 | 5.0 | 9.0 |
X_scaled_rfm_exp_plus = StandardScaler().fit_transform(rfm_exp_plus)
df_X_scaled_rfm_exp_plus = pd.DataFrame(
X_scaled_rfm_exp_plus, columns=rfm_exp_plus.columns, index=rfm_exp_plus.index
)
df_X_scaled_rfm_exp_plus.head(2)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | |
|---|---|---|---|---|---|
| customer_unique_id | |||||
| b55fb2799bf92225fc54fdb4cb7b82e2 | 0.211420 | -0.153988 | -0.597826 | 0.677115 | -0.746587 |
| 91ed74cc8fc470081c5b71fe5016c89f | -0.564669 | -0.153988 | 2.805209 | 0.677115 | -0.318976 |
# Plot settings for the dendrogram
plt.rcParams["axes.grid"] = False
# Prepare the labels for the clustering
names = rfm_exp_plus.index
# Hierarchical clustering
Z = linkage(X_scaled_rfm_exp_plus, "ward")
# Display the dendrogram
plt.figure(figsize=(25, 8))
plt.title("Hierarchical Clustering Dendrogram", size=35)
plt.ylabel("distance")
dendrogram(Z, labels=names, leaf_font_size=10, orientation="top")
plt.axhline(y=16.4, color="r", linestyle="--")
plt.show()
plt.rcParams["axes.grid"] = False
plt.figure(figsize=(10, 5))
plt.title("Hierarchical Clustering Dendrogram", size=20)
plt.ylabel("distance")
dendrogram(
Z,
labels=names,
p=6,
truncate_mode="lastp",
leaf_font_size=10,
orientation="top",
show_contracted=True,
)
plt.axhline(y=16.4, color="r", linestyle="--")
plt.show()
# We can see the number of clients in each cluster
# We get the cluster labels
labels_cah_plus = fcluster(Z, 65, criterion="distance")
rfm_cah_plus = rfm_exp_plus.copy()
rfm_cah_plus["Clusters"] = labels_cah_plus
calculate_cluster_statistics(rfm_cah_plus)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|---|
| Clusters | ||||||
| 1 | 457.773383 | 1.000000 | 127.539338 | 4.405915 | 10.612200 | 2.705 |
| 2 | 196.125286 | 1.000000 | 124.270042 | 4.669050 | 10.477117 | 5.244 |
| 3 | 272.556701 | 2.137457 | 310.761615 | 4.166189 | 11.937915 | 0.291 |
| 4 | 274.469496 | 1.000000 | 963.585491 | 3.864721 | 14.129973 | 0.377 |
| 5 | 307.911280 | 1.000000 | 140.352763 | 1.256654 | 9.277567 | 0.789 |
| 6 | 274.974747 | 1.000000 | 156.357475 | 1.378788 | 33.786195 | 0.594 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters avec CAH", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_cah_plus), annot=True, cmap="Reds", fmt=".3f"
);
# Some clusters have the same profile as those obtained with KMeans, which is reassuring.
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled_rfm_exp_plus, labels_cah_plus).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled_rfm_exp_plus, labels_cah_plus).round(2)
res_cah_plus = {"Silhouette Score": silhouette, "Davies-Bouldin Score": davies_bouldin}
index = ["CAH_plus"]
res_cah_plus = pd.DataFrame(res_cah_plus, index=index)
res_cah_plus
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| CAH_plus | 0.3 | 1.05 |
eps_values = np.arange(0.1, 1, 0.2)
min_samples_values = np.arange(25, 30, 5)
explore_dbscan_params(X_scaled_plus, eps_values, min_samples_values)
Parameters: eps=0.1, min_samples=25
Number of clusters: 111
Number of noise points: 59680
Silhouette score: -0.5265713419317958

Parameters: eps=0.30000000000000004, min_samples=25
Number of clusters: 8
Number of noise points: 10083
Silhouette score: 0.0180201954540568

Parameters: eps=0.5000000000000001, min_samples=25
Number of clusters: 8
Number of noise points: 4064
Silhouette score: 0.19004583128264677

Parameters: eps=0.7000000000000001, min_samples=25
Number of clusters: 3
Number of noise points: 2050
Silhouette score: 0.5524995410548734

Parameters: eps=0.9000000000000001, min_samples=25
Number of clusters: 4
Number of noise points: 931
Silhouette score: 0.5433949058713025
db_plus = DBSCAN(eps=0.7, min_samples=25).fit(X_scaled_plus)
labels_plus = db_plus.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_plus)) - (1 if -1 in labels_plus else 0)
n_noise_ = list(labels_plus).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 3 Estimated number of noise points: 2050
# Compute the Silhouette Score
silhouette = silhouette_score(X_scaled_plus, labels_plus).round(2)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_scaled_plus, labels_plus).round(2)
res_dbscan_plus = {
"Silhouette Score": silhouette,
"Davies-Bouldin Score": davies_bouldin,
}
index = ["DBScan_plus"]
res_dbscan_plus = pd.DataFrame(res_dbscan_plus, index=index)
res_dbscan_plus
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| DBScan_plus | 0.55 | 1.34 |
plot_clusters(df_X_scaled_plus, labels_plus)
rfm_db_plus = rfm_plus.copy()
rfm_db_plus["Clusters"] = labels_plus
rfm_db_plus.head(2)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Clusters | |
|---|---|---|---|---|---|---|
| customer_unique_id | ||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 0 |
# Convert 'Clusters' to string
rfm_db_plus["Clusters"] = rfm_db_plus["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
rfm_db_plus,
x="Recency",
y="Frequency",
z="MonetaryValue",
color=rfm_db_plus["Clusters"],
)
fig.show()
When the extreme values are excluded, the model segments mainly according to purchase frequency.
calculate_cluster_statistics(rfm_db_plus)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|---|
| Clusters | ||||||
| -1 | 336.681463 | 1.465366 | 974.210483 | 3.074137 | 27.299704 | 2.050 |
| 0 | 287.299628 | 1.000000 | 146.850905 | 4.107266 | 11.735139 | 91.697 |
| 1 | 263.749241 | 2.000000 | 232.537132 | 4.377440 | 10.701518 | 2.305 |
| 2 | 158.930233 | 3.000000 | 231.513953 | 4.860465 | 8.837209 | 0.043 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters avec DBScan", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_db_plus), annot=True, cmap="Reds", fmt=".3f"
);
# Cluster -1 contains the outliers according to DBSCAN.
cluster0_plus = rfm_db_plus[rfm_db_plus.Clusters == "0"]
cluster0_plus.head(2)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Clusters | |
|---|---|---|---|---|---|---|
| customer_unique_id | ||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 0 |
X_0_plus = cluster0_plus.drop("Clusters", axis=1).values
X_0_scaled_plus = StandardScaler().fit_transform(X_0_plus)
db_0_plus = DBSCAN(eps=0.5, min_samples=25).fit(X_0_scaled_plus)
labels_0_plus = db_0_plus.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_0_plus)) - (1 if -1 in labels_0_plus else 0)
n_noise_ = list(labels_0_plus).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 4 Estimated number of noise points: 2812
rfm_db_0_plus = cluster0_plus.drop("Clusters", axis=1).copy()
rfm_db_0_plus["Clusters"] = labels_0_plus
calculate_cluster_statistics(rfm_db_0_plus)
| Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Nb_of_clients(k) | |
|---|---|---|---|---|---|---|
| Clusters | ||||||
| -1 | 335.060811 | 1.0 | 583.224381 | 3.216216 | 21.959459 | 2.812 |
| 0 | 285.238392 | 1.0 | 132.954915 | 4.136158 | 11.414795 | 88.757 |
| 1 | 742.054054 | 1.0 | 97.465676 | 1.000000 | 9.972973 | 0.037 |
| 2 | 739.590909 | 1.0 | 92.887727 | 5.000000 | 9.484848 | 0.066 |
| 3 | 366.000000 | 1.0 | 613.771200 | 4.000000 | 7.560000 | 0.025 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters avec DBScan", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_db_0_plus), annot=True, cmap="Reds", fmt=".3f"
);
res_plus = pd.concat([res_kmeans_plus, res_cah_plus, res_dbscan_plus], axis=0)
res_plus
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| Kmeans_plus | 0.34 | 0.98 |
| CAH_plus | 0.30 | 1.05 |
| DBScan_plus | 0.55 | 1.34 |
res_total = pd.concat([res_rfm, res_review, res_plus], axis=0)
res_total
| Silhouette Score | Davies-Bouldin Score | |
|---|---|---|
| KMeans_rfm | 0.44 | 0.71 |
| CAH_rfm | 0.39 | 0.87 |
| DBScan_rfm | 0.70 | 1.16 |
| KMeans_review | 0.43 | 0.78 |
| CAH_review | 0.37 | 0.94 |
| DBScan_review | 0.61 | 1.23 |
| Kmeans_plus | 0.34 | 0.98 |
| CAH_plus | 0.30 | 1.05 |
| DBScan_plus | 0.55 | 1.34 |
We compared the clusters using the silhouette and Davies-Bouldin scores together with the business perspective. Despite its silhouette and Davies-Bouldin scores, the KMeans model on RFM + Review + Delivery duration with 6 clusters gave the best results from the business point of view.
acp_vars = rfm_plus.columns.to_list()
X_plus = rfm_plus
keep_ind = X_plus.index
X_scaled_plus = StandardScaler().fit_transform(X_plus)
# Cumulative sum of the explained variance ratios
from sklearn import decomposition
pca = decomposition.PCA()
pca.fit(X_scaled_plus)
print(pca.explained_variance_ratio_.cumsum())
[0.26742808 0.49226198 0.68811676 0.86116876 1. ]
We run a PCA and display the scree plot of the eigenvalues.
display_scree_plot(pca)
The dataset can be reduced to 4 components while keeping close to 85 % of the variance.
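As a sanity check, the component count can be derived programmatically from the cumulative variance; a small sketch using the `pca` fitted above.
# First number of components whose cumulative explained variance reaches 85 %
cumvar = pca.explained_variance_ratio_.cumsum()
n_keep = int(np.argmax(cumvar >= 0.85) + 1)
print(n_keep)  # 4 here, per the cumulative sums printed above
# Equivalently, PCA accepts a variance target directly:
# decomposition.PCA(n_components=0.85) keeps just enough components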
pcs = pca.components_
display_circles(pcs, 4, pca, [(0, 1), (2, 3)], labels=np.array(acp_vars))
# Projection of the individuals onto the factorial planes
X_projected_plus = pca.transform(X_scaled_plus)
display_factorial_planes(
X_projected_plus, len(acp_vars), pca, [(0, 1), (2, 3)], labels=None
)
plt.show()
# Reduce the dataset to 3 principal components
n_components = 3
pca = decomposition.PCA(n_components=3).fit(X_scaled_plus)
X_projected_plus = pca.transform(X_scaled_plus)
df_X_projected_plus = pd.DataFrame(
X_projected_plus,
columns=["comp_" + str(i + 1) for i in range(n_components)],
index=rfm_plus.index,
)
df_X_projected_plus.head(2)
| customer_unique_id | comp_1 | comp_2 | comp_3 |
|---|---|---|---|
| 0000366f3b9a7992bf8c76cfdf3221e2 | -1.104122 | 0.153793 | -0.550450 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | -0.948467 | -0.204809 | -0.809009 |
model = KMeans(init="k-means++", random_state=3)
visualizer = KElbowVisualizer(model, k=(2, 20), timings=False)
visualizer.fit(X_projected_plus)
visualizer.show();
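For readers without yellowbrick, the same elbow diagnostic can be reproduced with scikit-learn and matplotlib alone; a minimal sketch on the projected data.
# Within-cluster sum of squares (inertia) for a range of k; the elbow
# of this curve suggests the number of clusters
inertias = [
    KMeans(n_clusters=k, init="k-means++", random_state=3)
    .fit(X_projected_plus)
    .inertia_
    for k in range(2, 12)
]
plt.figure(figsize=(8, 4))
plt.plot(range(2, 12), inertias, marker="o")
plt.xlabel("k")
plt.ylabel("Inertia")
plt.title("Elbow curve on the PCA projection")
plt.show()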
# The elbow method helps choose the number of clusters
# Based on the plot, and to keep the same cluster count as for the other dataframes, we use 6 clusters
# Desired number of clusters
n_clust = 6
# K-means clustering
km_plus = KMeans(n_clusters=n_clust, init="k-means++", random_state=3)
km_plus.fit(X_projected_plus)
# Retrieve the cluster assigned to each individual
clusters_km_plus = km_plus.labels_
plot_clusters(df_X_projected_plus, clusters_km_plus)
rfm_km_plus = rfm_plus.copy()
rfm_km_plus["Clusters"] = clusters_km_plus
rfm_km_plus.head(2)
| customer_unique_id | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Clusters |
|---|---|---|---|---|---|---|
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 1 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 1 |
calculate_cluster_statistics(rfm_km_plus)
| Clusters | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Nb_of_clients(k) |
|---|---|---|---|---|---|---|
| 0 | 518.289183 | 1.000000 | 144.107623 | 4.419123 | 11.372293 | 17.916 |
| 1 | 131.615062 | 1.000000 | 132.639100 | 4.684298 | 7.508876 | 27.659 |
| 2 | 304.436889 | 1.001521 | 190.860465 | 1.353682 | 37.683576 | 4.603 |
| 3 | 272.459578 | 1.773222 | 672.499669 | 4.118481 | 11.885081 | 4.317 |
| 4 | 317.473161 | 1.000000 | 130.238059 | 4.726415 | 10.290377 | 25.429 |
| 5 | 251.865747 | 1.000000 | 164.791274 | 2.491528 | 16.029621 | 16.171 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters avec ACP", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_km_plus), annot=True, cmap="Reds", fmt=".3f"
);
The clustering results barely changed, since we have only a few variables for the PCA to compress.
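One way to verify this is to inspect the component loadings; a small sketch, assuming `pca` is the 3-component model fitted above and `acp_vars` lists the five model variables.
# Loadings of the 3 retained components on the original variables;
# with only five inputs, each component stays close to one or two raw features
loadings = pd.DataFrame(
    pca.components_,
    columns=acp_vars,
    index=["comp_" + str(i + 1) for i in range(pca.n_components_)],
)
loadings.round(2)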
df_3comp = pd.DataFrame(X_projected_plus, columns=["comp_1", "comp_2", "comp_3"])
df_3comp["Clusters"] = clusters_km_plus
df_3comp.head()
| | comp_1 | comp_2 | comp_3 | Clusters |
|---|---|---|---|---|
| 0 | -1.104122 | 0.153793 | -0.550450 | 1 |
| 1 | -0.948467 | -0.204809 | -0.809009 | 1 |
| 2 | 1.826431 | -1.043134 | 1.361289 | 0 |
| 3 | 0.609716 | -0.682477 | 0.306670 | 5 |
| 4 | -0.275935 | -0.060003 | 0.486988 | 4 |
# Cast 'Clusters' to string
df_3comp["Clusters"] = df_3comp["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
df_3comp,
x="comp_1",
y="comp_2",
z="comp_3",
color=df_3comp["Clusters"],
)
fig.show()
# Compute the Silhouette Score
silhouette = silhouette_score(X_projected_plus, km_plus.labels_)
print("Silhouette Score:", silhouette)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_projected_plus, km_plus.labels_)
print("Davies-Bouldin Score:", davies_bouldin)
Silhouette Score: 0.33338069841739304 Davies-Bouldin Score: 0.9392606100998574
plt.figure(figsize=(10, 5))
visualizer = SilhouetteVisualizer(km_plus, colors="yellowbrick")
visualizer.fit(X_projected_plus)
visualizer.poof()
<Axes: title={'center': 'Silhouette Plot of KMeans Clustering for 96095 Samples in 6 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
db_plus_acp = DBSCAN(eps=0.5, min_samples=25).fit(X_projected_plus)
labels_plus_acp = db_plus_acp.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_plus_acp)) - (1 if -1 in labels_plus_acp else 0)
n_noise_ = list(labels_plus_acp).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 2 Estimated number of noise points: 1093
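With eps=0.5, DBScan merges nearly everything into a single cluster; before concluding, the eps value can be checked with the usual k-distance heuristic. A hedged sketch (min_samples=25 matches the fit above):
from sklearn.neighbors import NearestNeighbors

# Distance from each point to its 25th nearest neighbour; the knee of the
# sorted curve is a common heuristic for choosing eps
nn = NearestNeighbors(n_neighbors=25).fit(X_projected_plus)
distances, _ = nn.kneighbors(X_projected_plus)
k_dist = np.sort(distances[:, -1])
plt.figure(figsize=(8, 4))
plt.plot(k_dist)
plt.xlabel("Points sorted by 25th-NN distance")
plt.ylabel("Distance")
plt.title("k-distance plot for choosing eps")
plt.show()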
# Compute the Silhouette Score
silhouette = silhouette_score(X_projected_plus, labels_plus_acp)
print("Silhouette Score:", silhouette)
# Compute the Davies-Bouldin Score
davies_bouldin = davies_bouldin_score(X_projected_plus, labels_plus_acp)
print("Davies-Bouldin Score:", davies_bouldin)
Silhouette Score: 0.6809625604247442 Davies-Bouldin Score: 1.0872098774362855
plot_clusters(df_X_projected_plus, labels_plus_acp)
rfm_db_plus_acp = rfm_plus.copy()
rfm_db_plus_acp["Clusters"] = labels_plus_acp
rfm_db_plus_acp.head()
| customer_unique_id | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Clusters |
|---|---|---|---|---|---|---|
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 1 | 141.90 | 5.0 | 6.0 | 0 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 1 | 27.19 | 4.0 | 3.0 | 0 |
| 0000f46a3911fa3c0805444483337064 | 585 | 1 | 86.22 | 3.0 | 25.0 | 0 |
| 0000f6ccb0745a6a4b88665a16c9f078 | 369 | 1 | 43.62 | 4.0 | 20.0 | 0 |
| 0004aac84e0df4da2b147fca70cf8255 | 336 | 1 | 196.89 | 5.0 | 13.0 | 0 |
rfm_db_plus_acp["Clusters"].value_counts()
Clusters
 0    94969
-1     1093
 1       33
Name: count, dtype: int64
# Cast 'Clusters' to string
rfm_db_plus_acp["Clusters"] = rfm_db_plus_acp["Clusters"].astype(str)
# Visualize the result
fig = px.scatter_3d(
rfm_db_plus_acp,
x="Recency",
y="Frequency",
z="MonetaryValue",
color=rfm_db_plus_acp["Clusters"],
)
fig.show()
calculate_cluster_statistics(rfm_db_plus_acp)
| Clusters | Recency | Frequency | MonetaryValue | review_mean | deliver_duration_mean | Nb_of_clients(k) |
|---|---|---|---|---|---|---|
| -1 | 338.343092 | 1.672461 | 1166.473321 | 3.025753 | 30.443483 | 1.093 |
| 0 | 287.182502 | 1.026788 | 155.060949 | 4.104060 | 11.830460 | 94.969 |
| 1 | 189.181818 | 3.000000 | 240.410909 | 4.828283 | 8.686869 | 0.033 |
fig = plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)
plt.title("La Heatmap avec les Clusters avec ACP", size=25)
sns.heatmap(
calculate_cluster_statistics(rfm_db_plus_acp), annot=True, cmap="Reds", fmt=".3f"
);
# Cluster -1 holds the points DBScan flags as noise (outliers).
Once again the clusters barely change: with so few input variables, the PCA projection has little to compress.